# This script produces Table 3 and the topic-proportions figure (the latter is reported in the appendix).

# Make sure the output folders for figures ("figs") and tables ("tabs") exist
# in the working directory before anything is written to them.
for (out_dir in c("figs", "tabs")) {
  if (!dir.exists(out_dir)) {
    dir.create(out_dir)
  }
}

library(tidyverse)
library(arabicStemR)
library(tidytext)
library(topicmodels)
library(lubridate)
library(ggplot2)
library(quanteda)
library(stm)
library(stats)
library(ggthemes)
library(ggpubr)

# NOTE(review): this wipes every object in the current environment (attached
# packages are unaffected). Kept as-is; be aware of it if this script is ever
# sourced from a session or driver script that holds objects of its own.
rm(list = ls())

# Reading data ------------------------------------------------------------
# The code below expects this file to provide at least `topic_model` and `meta`.
load("k40.RData")

# Tidy representations of the fitted model: the default matrix and "gamma".
beta <- tidy(topic_model)
gamma <- tidy(topic_model, matrix = "gamma")

# Expose the `article` column under the name `document`, the key used for the
# join below, and drop the original column.
meta <- meta %>%
  mutate(document = article) %>%
  select(-article)

# Attach the metadata to every row of the gamma table.
gamma <- left_join(gamma, meta, by = "document")
gc()

# reorder_within <- function(x, by, within, fun = mean, sep = "___", ...) {
#   new_x <- paste(x, within, sep = sep)
#   stats::reorder(new_x, by, FUN = fun)
# }
# 
# scale_x_reordered <- function(..., sep = "___") {
#   reg <- paste0(sep, ".+$")
#   ggplot2::scale_x_discrete(labels = function(x) gsub(reg, "", x), ...)
# }


# Naming topics -----------------------------------------------------------
# Mean of the `gamma` column within each topic, largest first.
topic_avg <- gamma %>%
  group_by(topic) %>%
  summarise(avg = mean(gamma)) %>%
  arrange(desc(avg))

# Top 30 terms per topic as reported by labelTopics().
terms <- labelTopics(topic_model, n = 30)

# One row per topic: the topic index (row position in the `prob` matrix) and
# that row's terms collapsed into a single comma-separated string.
# The matrix is collapsed directly instead of going through as_tibble(), which
# is deprecated for matrices without column names and forced the code to rely
# on auto-generated V1..V30 names (and on the term count being exactly 30).
prob_terms <- terms$prob
topic_terms <- tibble(
  topic = seq_len(nrow(prob_terms)),
  terms = unname(apply(prob_terms, 1, paste, collapse = ","))
)


# Hand-assigned topic labels. They are matched to topics purely by position:
# element i labels row i of `topic_terms`.
topic_labels <- c(
  "communication", "assad", "isis", "temperature", "elections",
  "weather", "foreign fighters", "united states", "announcements", "conspiracies and plots",
  "election winners", "legislation", "terrorist attacks", "speeches", "victims",
  "gulf", "accident", "media", "europe", "yemen",
  "culture and sports", "capitulation", "economy", "diplomacy", "religion",
  "transport", "natural resources", "national unity", "lebanon", "finance",
  "bureaucracy", "education", "fighting terrorism", "Israel and Palestine", "ba'th party",
  "Iraq", "Turkey", "russia", "Iran", "regional politics"
)

# Hand-written English term lists, in the same positional order as the labels.
# They replace the model-derived `terms` strings in the final table
# (presumably as translations/glosses of those terms -- TODO confirm).
english_terms <- c(
  "add, call, say, information, electronic, details",
  "president, mister, Assad, Bashar, Arab, Syria",
  "terrorism, organization, Syria, ISIS, state, armed",
  "[regions in Syria], degree, temperature, increase, averages, high, low",
  "election, constitution, people, presidency, council, candidate",
  "[regions in Syria], agriculture, reach, rain, season",
  "Newspaper, British, Tunisian, French, intelligence",
  "America, United, States, president, Washington, military",
  "first, public, SANA, yesterday, thawrah, Damascus",
  "people, resistance, Zionist, enemy, conspiracy, interference",
  "[winners' names], regional, doctor, MP, council, committee",
  "article, law, decree, ruling, number, court",
  "armed, group, terrorist, elements, fire",
  "said, politics, states, should, change, discuss, people",
  "aid, humanitarian, crisis, support, hungry, camp",
  "Saudi, Egypt, Qatar, regime, gulf, opposition, kingdom",
  "bombing, crime, children, civilian, assault",
  "media, ask, truth, channel, report, events, news",
  "Europe, union, France, nations, Germany, nuclear",
  "Yemen, enemy, Saudi, army, raid, bombing",
  "culture, Syria, world, exhibition, modern, sport",
  "Aleppo, city, people, army, terrorism, reconciliation, Homs",
  "economy, Syria, sector, trade, invest, tourism",
  "relation, visit, state, delegation, meeting, foreign",
  "religion, Islam, Patriarch, peace, Christian, Muslim",
  "Jordan, sea, fly, border, transport, airport",
  "oil, water, project, electricity, station, gas",
  "nation, Syria, martyrs, army, sons, people, land, victory",
  "Lebanon, resistance, Israel, president, nation, enemy",
  "money, public, firm, Lira, bank, dollar",
  "council, project, work, public, manage, minister",
  "university, education, student, study, higher, public",
  "terror, source, army, armed, organization, destruction, kill",
  "Israel, Palestine, occupation, strip (Gaza), Jerusalem, settler",
  "nation (watan; qawm), party, Arab, leadership, people, Ba'th",
  "Iraq, Baghdad, kill, America, injure, city",
  "Turkey, Erdogan, party, government, regime, development",
  "Russia(n), Moscow, united, Foreign, Lavrov",
  "Iran, state, region, Islam, Tehran, nuclear",
  "state, Arab, Israel, peace, decision, conference"
)

# The labels are positional, so fail with a clear message if the number of
# topics does not line up with the hand-written vectors.
if (length(topic_labels) != length(english_terms) ||
    nrow(topic_terms) != length(topic_labels)) {
  stop(
    "Expected ", length(topic_labels), " topics to label but `topic_terms` has ",
    nrow(topic_terms), " rows and there are ", length(english_terms),
    " term strings.",
    call. = FALSE
  )
}

# Final table: title-cased label, English terms, and mean topic proportion.
# The join key is spelled out so the join no longer depends on whichever
# columns the two tables happen to share.
df <- topic_terms %>%
  left_join(topic_avg, by = "topic") %>%
  mutate(
    topic_name = str_to_title(topic_labels),
    terms = english_terms
  ) %>%
  select(
    "Topic Label" = topic_name,
    "High Probability Terms" = terms,
    "Expected Proportion" = avg
  )


# Write the topic table to LaTeX, topics sorted from largest to smallest
# expected proportion. Row names are suppressed; `FALSE` is spelled out
# because `F` is an ordinary, reassignable binding.
df %>%
  arrange(desc(`Expected Proportion`)) %>%
  xtable::xtable(
    caption = "Topic labels, high probability terms for each topic, and their expected proportion",
    label = "tab:topics"
  ) %>%
  xtable::print.xtable(include.rownames = FALSE, file = "tabs/topics.tex")


# Topics proportions ------------------------------------------------------
library(ggplot2)
library(ggthemes)

# Bar chart of each topic's expected proportion. After sorting from largest to
# smallest, the labels are turned into a factor whose levels are that order
# reversed, so that once the axes are flipped the largest topic sits at the top.
df %>%
  arrange(desc(`Expected Proportion`)) %>%
  mutate(`Topic Label` = factor(`Topic Label`, levels = rev(`Topic Label`))) %>%
  ggplot(aes(x = `Topic Label`, y = `Expected Proportion`)) +
  geom_col(width = .1) +
  coord_flip() +
  theme_few() +
  labs(x = "", y = "Expected Topic Proportions")

# No plot object is passed, so ggsave() writes the most recently built ggplot,
# i.e. the chart above.
ggsave(filename = "figs/topic_proportions.pdf", width = 6, height = 6)




